# Open the dataset
d <- read.delim("D:/Teaching/Bayes Reg/Intro to Bayes Regression OPerra/Exercises/private_school.csv")
str(d)
head(d)
summary(d)


### Before you can run these examples, you will need to download and install...
#...the packages below. These also require the installation of Tool Chain ++
# See this webpage for installation instructions: 
# https://github.com/rmcelreath/rethinking#installation 
library(StanHeaders)
library(cmdstanr)
library(rstan)
library(ggplot2)
library(rethinking)
library(psych) 
#package "psych" allows some functions like "describe"

###########################################################################################################
#### 1. Build a linear regression model where math scores in Grade 12 are a function of math scores in Grade 8. #### 
###########################################################################################################

###################################################
#1a. Consider the metric of outcome and predictors: 
#would it be sensible to centre and/or standardise some of these variables?
###################################################
#describe the variables of interest
describe (d[ , c("math12",  "math8") ], fast=TRUE)
# The scores of the predictor are translated in a different metric where the mean 50-->0 and the SD=10-->1
d$smath8<-(d$math8-50)/10
cor.test(d$math12, d$smath8)
describe (d[ , c("math12",  "math8", "smath8") ], fast=TRUE)

###################################################
#1b. Create and check priors for the parameters in the model (intercept, slopes, SD). 
###################################################

#The model can be described as:
#m12 ~ Norm(mu, sigma)
#mu= a + b(m8) http://127.0.0.1:38705/graphics/plot_zoom_png?width=1200&height=900
#a=Norm( Ma, SDa)
#b=Norm( Mb, SDb)
#sigma=?
#Since the math12 "m12" data have been standardised to SD=1, I may give sigma a uniform distribution 
#with values 0 to 3, say. 
#However, given that m12 is assumed normally distributed with SD approx 1, I will expect that values
#close to 1 will be more likely and extreme values increasingly unlikely. At the same time, the distribution
#has to be positive. One possible distribution that meets these demands is exponential with rate=1, so:
#sigma=Exp(1). See a random sample of 1k draws from a similar distribution to get a sense of what it can look like:
example.sigma<-rlnorm(1000, 2, .7)
dens(example.sigma)
describe(example.sigma)
#Given the standardisation of scores, the prior for the intercept a may be also quite simple, say mean=0, SD=1

#visualise this prior
set.seed(27416)
sample_mu<-rnorm(1000, 50, 10)
sample_sigma<-rlnorm(1000, 2, .7)
prior_m1<-rnorm(1000, sample_mu, sample_sigma)
dens(prior_m1)
quantile(prior_m1, c(0.1, 0.90))


#Change it to make it narrower, e.g. SD=5
set.seed(27416)
sample_mu<-rnorm(1000, 50, 5)
sample_sigma<-rlnorm(1000, 2, .7)
prior_m1<-rnorm(1000, sample_mu, sample_sigma)
dens(prior_m1)
quantile(prior_m1, c(0.1, 0.90))

# Assuming that the slope has mean 0 and SD=10, what would the expected slopes look like?
set.seed(27416)
N<-100
a<-rnorm(N, 50, 5 )
b<-rnorm(N, 0, 10)
#Create an empty plot 
plot(NULL, xlim=c(-3,3), ylim=c(0,100), xlab="Math scores Grade 8", ylab="Math scores Grade 12")
mtext("b ~ dnorm(0,10)")
#the function below creates and plots lines using the linear model
for (i in 1:N) curve(a[i] + b[i]*(x), from=-3, to=3, add=TRUE, col=col.alpha("brown3",0.2), lwd=3)

# Assuming that the slope has mean 0 and SD=5, what would the expected slopes look like?
set.seed(4978)
N<-100
a<-rnorm(N, 50, 5)
b<-rnorm(N, 0, 5)
#Create an empty plot 
plot(NULL, xlim=c(-3,3), ylim=c(0,100), xlab="Math scores Grade 8", ylab="Math scores Grade 12")
mtext("b ~ dnorm(0,5)")
#the function below creates and plots lines using the linear model
for (i in 1:N) curve(a[i] + b[i]*(x), from=-3, to=3, add=TRUE, col=col.alpha("brown3",0.2), lwd=3)


###################################################
#2 make sure that you use an ad-hoc list with the variables of interest 
###################################################

### Compile a list with only the two variables of interest. There are no missing data in those variables.
dl <- list(
  math12 = d$math12,
   smath8 = d$smath8 )

###################################################
#2 Run the model using ULAM specifying only 1 chain. 
###################################################

### Run the Model using the function "ulam"
set.seed(96760) #Used to ensure the output is replicated
m1<- ulam(
  alist(
    math12 ~ dnorm( mu, sigma) ,
    mu<- a + b*(smath8),
    a ~ dnorm(50, 5),
    b ~ dnorm(0, 5),
    sigma ~ dlnorm(2, .7)
  ), data=dl, chains=1)
#Check the run of the model
show(m1)
#Check the results in tabular form, as well as some key diagnostics 
precis(m1)
#Run further diagnostics for the algorithm. See "Statistical Rethinking" book...
#...particularly Chapter 9, for more information
traceplot(m1)

###################################################
#3 Run the model using ULAM but specify 6 chains 
###################################################

### Run the Model using the function "ulam", but add more chains
set.seed(96760) #Used to ensure the output is replicated
m1<- ulam(
  alist(
    math12 ~ dnorm( mu, sigma) ,
    mu<- a + b*(smath8),
    a ~ dnorm(50, 5),
    b ~ dnorm(0, 5),
    sigma ~ dlnorm(2, .7)
  ), data=dl, chains=6, core=6)
#Check the run of the model
show(m1)
#Check the results in tabular form, as well as some key diagnostics 
precis(m1)
#Run further diagnostics for the algorithm. See "Statistical Rethinking" book...
#...particularly Chapter 9, for more information
traceplot(m1)
trankplot(m1)

### Estimate variacs and covariances from posterior
round(vcov(m1), 3)

###################################################
#4 .	After inspecting the poste marginal posterior distribution in tabular form,
#plot the observed results, the average slope, 
#the 95% credibility interval around the mean, and the 95% credibility 
#interval of math12 scores. 
###################################################

# Extract samples from the posterior distribution using "rethinking" package...
#...function "extract.samples"
set.seed(835217)
post<-extract.samples(m1)
# Create a sequence of values for the predictor
smath8.seq<-seq(from=-2, to=3, by=.1 )
# The following creates a function to calculate the expected values of dependent
#based on the indipdendent and the liner function parameters from the posterior
mu.link<-function(smath8) post$a + post$b*smath8
#This now calculates the value of "mu" for each value of the sequence smath8
mu<-sapply(smath8.seq, mu.link)
#now I can calculate the mean of these values
mu.mean<-apply(mu, 2, mean)
#and now I can calculate the CIs considering p=.95
mu.ci<-apply(mu,2,PI, prob=0.95)

set.seed(835217)
#create a function to calculate predicted scores of math12 considering parameters from posterior
premath12<-function(smath8) rnorm(3000, (post$a + post$b*smath8) , post$sigma)
#calculate predicted math12 scores based on the function above and the sequence of math8 scores
predict.math12<-sapply(smath8.seq, premath12)
#calculate the 95% CI of these predicted scores
math12.ci<-apply(predict.math12,2,PI, prob=0.95)

# plot the observed data
plot(math12 ~ smath8, data=d,  pch=19, col="firebrick2", xlab="Std Math Scores Grade 8", 
     ylab="Math Scores Grade 12", cex.lab=1.5, cex=1.5)
#### Add information about uncertainty around the central values ####
shade(mu.ci, smath8.seq, col=col.alpha("peachpuff", .75))
### Add the mean of mu
lines(smath8.seq, mu.mean, lwd=6, col="firebrick4")
### add CIs for the predicted scores
shade(math12.ci, smath8.seq, col=col.alpha("#1C5A99", .45))

